knitr::opts_chunk$set(echo = TRUE)
pacman::p_load(dplyr, ggplot2, readr, plotly, googleVis)
💡 讀完原始資料之後,先將資料壓縮起來,之後再從壓縮檔讀進會比較快、比較方便
load("data/olist.rdata")
load("data/Z.rdata")
locate<-read.csv("olist_geolocation_dataset.csv")
OP<- read.csv("olist_order_payments_dataset.csv")
PE<- read.csv("product_category_name_translation.csv")
pop<-read.csv("population.csv")
買家位置分佈長條圖tail(sort(table(C$customer_state)))
##
## SC PR RS MG RJ SP
## 3637 5045 5466 11635 12852 41746
ggplot(C, aes(x=customer_state)) +
geom_bar()
賣家位置分佈長條圖tail(sort(table(S$seller_state)))
##
## RS RJ SC MG PR SP
## 129 171 190 244 349 1849
ggplot(S, aes(x=seller_state)) +
geom_bar()
買家位置分佈圖countC = as.data.frame(table(C$customer_state))
dataC <- merge(countC, pop, by.x="Var1", by.y="state")
dataC2 <- merge(C, locate, by.x="customer_zip_code_prefix", by.y="geolocation_zip_code_prefix")
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(maptools)
## Warning: package 'maptools' was built under R version 3.5.2
## Loading required package: sp
## Checking rgeos availability: FALSE
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
library(sp)
library(dplyr)
install.packages("maps", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/zj/3mrqmd7j1516zg6pbw_ps6k00000gn/T//RtmpfpQ0nh/downloaded_packages
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
Brazil<-map_data("world")%>%filter(region=="Brazil")
ggplot(Brazil,aes(long,lat))+
geom_polygon(aes(group=group),fill="white",colour="grey60")+
geom_point(data=dataC,aes(x=lng,y=lat, size = Freq, color=Var1))+
geom_text(data=dataC, mapping = aes(x = lng, y = lat, label = Var1))
賣家位置分佈圖countS = as.data.frame(table(S$seller_state))
dataS <- merge(countS, pop, by.x="Var1", by.y="state")
Brazil<-map_data("world")%>%filter(region=="Brazil")
ggplot(Brazil,aes(long,lat))+
geom_polygon(aes(group=group),fill="white",colour="grey60")+
geom_point(data=dataS,aes(x=lng,y=lat, size = Freq,color=Var1))+
geom_text(data=dataS, mapping = aes(x = lng, y = lat, label = Var1))
💡 結論
#從分析的結果來看,買家賣家的主要位置分布在SP、PR、MG、SC、RJ、RS這五個城市,當平台有活動想要增加促銷時,可以優先考慮將廣告投放在這幾個主要客群的城市增加銷量,達到業績提升的效果
buyer and payment_type分佈data <- merge(OP, O, by = "order_id")
data <- merge(data, C, by = "customer_id")
data2<-as.data.frame(data[,c(4,16)])
ggplot(data2, aes(x = payment_type, fill = payment_type )) +
geom_bar(position="dodge") + facet_grid(~ customer_state )
💡 結論
#各城市的主要payment_type皆以信用卡為主
buyer and product分佈I$order_id <- as.character(I$order_id )
data$order_id <- as.character(data$order_id )
data3 <- left_join(data, I, by = "order_id")
data3 <-left_join(data3, P, by = "product_id")
data4 <-as.data.frame(data3[,c(16,23)])
ggplot(Brazil,aes(long,lat))+
geom_polygon(aes(group=group),fill="white",colour="grey60")+
geom_point(data=pop,aes(x=lng,y=lat, color=max_product))+
geom_text(data=pop, mapping = aes(x = lng, y = lat, label = state))
💡 結論
#從分析的結果來看,可以發現北巴西買家主要購買的產品為健康美容產品,南巴西則以寢具用品為主,休閒運動為次要的需求,各地區的賣家可以依此結果了解買家的喜好,來調整販售的產品(ex.像是大型傢俱廠商可以盡量在南巴西設廠,以減少鉅額運費的產生)
付款模式分析與行銷#同一筆訂單會使用多重的付款方式
nrow(unique(OP[, 1:3])) == nrow(OP)
## [1] TRUE
#
nrow(unique(I[, 1:3])) == nrow(I)
## [1] TRUE
1.付款方式
一筆訂單最多使用了多少種付款方式?
table(OP$order_id) %>% table()
## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 96479 2382 301 108 52 36 28 11 9 5 8 8
## 13 14 15 19 21 22 26 29
## 3 2 2 2 1 1 1 1
#96479筆訂單只使用一種付款方式
每一筆訂單付款方式?
X = unique(I[, 1:3]) %>% left_join(OP) %>% left_join(O[,c(1,4)]) %>% left_join(P[, 1:2]) %>% left_join(PE)
## Joining, by = "order_id"
## Warning: Column `order_id` joining character vector and factor, coercing
## into character vector
## Joining, by = "order_id"
## Joining, by = "product_id"
## Joining, by = "product_category_name"
## Warning: Column `product_category_name` joining character vector and
## factor, coercing into character vector
pie <- ggplot(X, aes(x=factor(1), fill=payment_type))+
geom_bar(width = 1)+
coord_polar("y", start = 0)+
labs(x="", y="", title = "Payment Type")
pie + scale_fill_brewer(palette = "Blues")+
theme_minimal()
💡 結論
#在此巴西電商平台中,消費者使用最頻繁的付款方式為信用卡消費,次之為Boleto支付,第三為使用兌換卷購買。
2.不同付費方式之訂單中,購買的產品類別、分期數與價格間關係之分析。
不同付款方式的分期付款數?
table(X$payment_installments)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 3 58617 13722 11756 7979 6017 4617 1828 5063 726 6845 25
## 12 13 14 15 16 17 18 20 21 22 23 24
## 163 18 16 92 7 7 38 21 5 1 1 34
ggplot(X, aes(x=payment_type, fill=factor(payment_installments)))+
geom_bar(position = "stack")
ggplot(X, aes(x=factor(payment_installments), group=payment_type))+
geom_line(aes(color=payment_type), position = "identity", stat = "count")
💡 結論
#使用信用卡支付的分期付款數多為1至3次分期付款數;其他支付方式的訂單皆為一次繳清。
每一筆訂單購買的產品類別,所對應到的付款方式?
product_pay <- ggplot(X, aes(x=product_category_name, fill=payment_type))+
geom_bar(position = "fill")+
coord_flip()+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) #座標軸轉換90度
product_pay
💡 結論
#所有產品種類皆以信用卡付款為大宗
不同付款方式與價格之關係。
ggplot(X, aes(x=payment_type, y=payment_value))+
geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
ggplot(X, aes(x=payment_type, y=log(payment_value)))+
geom_boxplot()
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
💡 結論
#使用兌換卷消費的訂單交易成本中位數較低。
#其他無明顯差異。
不同分期付款次數下的交易成本? # {r} # statistic <- group_by(X, payment_installments) %>% # summarise( # noOrder = n(), # avgpay = mean(payment_value) # ) # statistic # ggplot(statistic, aes(x=payment_installments, y= avgpay))+ # geom_point(color= "#993333", size= 3,) #
💡 結論
#分期次數越高,單筆平均交易成本越高。
目的: a.推斷時間最長之貨物種類 與最短之貨物種類。} b.推斷貨物體積與寄送時間差異。 c.推斷貨物重量與寄送時間差異。 {d.推斷貨品寄送時間與評價的差異。}
library(ggplot2)
library(dplyr)
library(readr)
###新增一列deltime計算時間差
#轉換資料格式
O$order_delivered_customer_date= as.POSIXct(as.character(O$order_delivered_customer_date),format="%Y-%m-%d %H:%M")
O$order_purchase_timestamp= as.POSIXct(as.character(O$order_purchase_timestamp),format="%Y-%m-%d %H:%M")
#在O中新增一列 運送時間 O1為增加欄位後資料
O1= mutate(O, deltime=order_delivered_customer_date - order_purchase_timestamp) %>% arrange(desc(deltime))
#deltime 時間差(hr)
###a推斷時間最長之貨物種類 與最短之貨物種類
#a:推斷時間最長之貨物種類 與最短之貨物種類
O2<-O1 %>% left_join(I) %>% left_join(P) #併表 加入product category
## Joining, by = "order_id"
## Joining, by = "product_id"
aO2= select(O2,contains("deltime"),contains("category"))%>%arrange(desc(deltime)) #aO2剩下deltime時間差及產品類+依時間差做排序
#aO2h<- filter(aO2,deltime>2000) #有104件
#aO2l<- filter(aO2,deltime<30)#198件
table(filter(aO2,deltime>2000)$product_category_name)%>%sort()%>%tail(3)
##
## automotivo esporte_lazer moveis_decoracao
## 8 8 12
table(filter(aO2,deltime<30)$product_category_name)%>%sort()%>%tail(3)
##
## informatica_acessorios beleza_saude esporte_lazer
## 18 20 23
###以下分佈是為了查看、驗證貨物種類與寄送時間的關係
#informatica_acessorios的時間分布
aOia= filter(aO2,product_category_name=="informatica_acessorios")
ggplot(aOia,aes(x = product_category_name, y =deltime))+
geom_jitter()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 184 rows containing missing values (geom_point).
#Automotivo的時間分布
aOa= filter(aO2,product_category_name=="automotivo")
ggplot(aOa,aes(x = product_category_name, y = deltime))+
geom_jitter()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 96 rows containing missing values (geom_point).
#Moveis_decoracao的時間分布
aOmd= filter(aO2,product_category_name=="moveis_decoracao")
ggplot(aOmd,aes(x = product_category_name, y = deltime))+
geom_jitter()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 174 rows containing missing values (geom_point).
###b 推斷貨物體積與寄送時間差異
#先前的O2中已有長寬高資料
bO2= mutate(O2, volume=product_length_cm*product_height_cm*product_width_cm) %>%
select(contains("volume"),contains("deltime"))
bO2=arrange(bO2,desc(deltime))
ggplot(bO2,aes(x = volume, y = deltime)) +
geom_jitter(colour = "orange",alpha=0.5)+ #貨物體積與寄送時間差關係圖(只有體小的才會送比較久,大的反而還好)
xlab("volumes") + ylab("delivery time")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 3247 rows containing missing values (geom_point).
lm(volume~deltime,data=bO2) %>% summary #p value<2.2e-16 小於0.005 有顯著關係
##
## Call:
## lm(formula = volume ~ deltime, data = bO2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52772 -12066 -8432 2980 281828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.277e+04 1.156e+02 110.48 <2e-16 ***
## deltime 8.020e+00 3.079e-01 26.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23170 on 110176 degrees of freedom
## (3247 observations deleted due to missingness)
## Multiple R-squared: 0.006122, Adjusted R-squared: 0.006113
## F-statistic: 678.6 on 1 and 110176 DF, p-value: < 2.2e-16
###c.推斷貨物重量與寄送時間差異
#先前的O2中已有重量資料
cO2= select(O2,contains("weight"),contains("deltime")) %>%
arrange(desc(deltime))
ggplot(cO2,aes(x = product_weight_g, y = deltime))+
geom_jitter(colour = "orange",alpha=0.5)+#貨物重量與寄送時間差關係圖
xlab("product weight(g)") + ylab("delivery time")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 3247 rows containing missing values (geom_point).
lm(product_weight_g~deltime,data=cO2) %>% summary#p value<2.2e-16 小於0.005 有顯著關係
##
## Call:
## lm(formula = product_weight_g ~ deltime, data = cO2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7909 -1719 -1342 -261 38279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.666e+03 1.860e+01 89.57 <2e-16 ***
## deltime 1.415e+00 4.954e-02 28.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3728 on 110176 degrees of freedom
## (3247 observations deleted due to missingness)
## Multiple R-squared: 0.007348, Adjusted R-squared: 0.007339
## F-statistic: 815.5 on 1 and 110176 DF, p-value: < 2.2e-16
###d推斷貨品寄送時間與評價的差異
O3<- O1 %>% left_join(R)
## Joining, by = "order_id"
dO3<- select(O3,contains("score"),contains("deltime")) %>%
arrange(desc(deltime))
ggplot(dO3,aes(x = review_score, y = deltime)) +
geom_jitter(colour = "orange",alpha=0.5)+
geom_hline(aes(yintercept=5000), colour="red", linetype="dashed")+
xlab("review score") + ylab("delivery time")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 2987 rows containing missing values (geom_point).
💡 結論
#無明顯差異(運送時間可能只是其中一項影響評價的因素)
# 若運送時間在1500hrs以上,仍沒有辦法明顯看出會落在低評價區域,但運送時間5000hrs以上的店家均落在評分1及2
###(a)限制店家寄出時間與實際店家寄出時間差異與關係,然後再與商品類別作結合,判斷出哪些產品廠商容易超過時間寄送。
A1 = merge(I,O,by="order_id") # #讓有限制店家寄出時間和實際寄送到物流的時間合併
A2 = merge(P,TPC, by="product_category_name") #合併都有商品類別工作表
A3 = merge(A1,A2,by="product_id") #合併時間差異和有商品類別工作表
rm.A3=A3[complete.cases(A3), ] #這時候再統一剃除NA缺漏值
#算出shipping_limit_date 和order_delivered_carrier_date差異 #再分級距 分別是沒有超過時間、1星期內、1星期以上、1個月以上
#我們將兩個寄出時間從"字串"轉為"日期" 這時候單位是"days 天數"
order_delivered_carrier_date=as.Date(rm.A3$order_delivered_carrier_date,format= "%Y-%m-%d")
shipping_limit_date=as.Date(rm.A3$shipping_limit_date,format= "%Y-%m-%d")
#兩個寄出時間相減並命名為overtime 意思是超時
overtime=(order_delivered_carrier_date-shipping_limit_date)
#再分別將將overtime超時訂單時間由大到小印出來
overtime=as.numeric(overtime) #弄成數值
overtime=as.data.frame(overtime)#弄成資料框
Overtime=cut(overtime$overtime,breaks=c(-Inf,0,7,30,Inf),labels=c("on time","1~7","7~30","above 30"),right = F)
#再分級距 分別是沒有超過時間、1星期內、1星期以上、1個月以上
A4=mutate(rm.A3,Overtime=cut(overtime$overtime,breaks=c(-Inf,0,7,30,Inf),labels=c("<=0","1~7","7~30","above 30"),right = F))
ggplot(A4, aes(product_category_name_english,fill=Overtime),
main="Time difference ",xlab="product_category_name_english",ylab="count") + geom_bar(position="fill")+
coord_flip()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
💡 結論
#可以看出大部分訂單是沒有超過時間的
#算出shipping_limit_date 和order_delivered_carrier_date差異 ##弄級距 有超時7天內 7天~1個月 1個月以上 (未包含準時訂單)
#我們將兩個寄出時間從"字串"轉為"日期" 這時候單位是"days 天數"
order_delivered_carrier_date=as.Date(rm.A3$order_delivered_carrier_date,format= "%Y-%m-%d")
shipping_limit_date=as.Date(rm.A3$shipping_limit_date,format= "%Y-%m-%d")
#兩個寄出時間相減並命名為overtime 意思是超時
overtime = (order_delivered_carrier_date-shipping_limit_date)
#再分別將將overtime超時訂單時間篩選出
overtime1 = subset(overtime,overtime>0) %>% sort(decreasing = T)
overtime2 = as.numeric(overtime1)
overtime3 = as.data.frame(overtime2)
#哪些是超時寄出的商品類別? 命名為product_category_name_english
product_category_name_english=rm.A3$product_category_name_english[which(order_delivered_carrier_date>shipping_limit_date)]
Overtime=cut(overtime3$overtime2,breaks=c(0,7,30,Inf),labels=c("1~7","7~30","above 30"),right = F) #分級距 有超時7天內 7天~1個月 1個月以上
#建立一個新的資料框
A5=data.frame(product_category_name_english=product_category_name_english,Overtime=Overtime)
ggplot(A5, aes(product_category_name_english,fill=Overtime) )+ geom_bar(position="fill")+
labs(title="The relationship between product_category and Overtime day",x="product_category_name_english",y="count",fill="Overtime") +
coord_flip()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
#取Top10超時商品類別(資料級距未包含準時訂單)
Top10<-table(A5$product_category_name_english)%>%sort%>%tail(10)
A6<-filter(A5,A5$product_category_name_english%in%names(Top10))
#取前十大超時訂單數最多的
B2<-ggplot(A6,aes(x=product_category_name_english,fill=Overtime))+geom_bar()+ theme(axis.text.x=element_text(angle=45, hjust=1))
#可看出各級距的實際訂單數
B3<-ggplot(A6,aes(x=product_category_name_english,fill=Overtime))+geom_bar(position="fill")+ theme(axis.text.x=element_text(angle=45, hjust=1))
#可看出各級距的比例
B2
B3
###(b)超時寄送的產品評價(包含準時、超時訂單)
#把產品評論R剔除缺漏值
rm.R=R[complete.cases(R), ]
#我們再將rm.A3、rm.R合併工作表,命成為R1
R1=merge(rm.A3,rm.R,by="order_id")
#我們將R1轉換成日期,相減再命命為
R1$order_delivered_carrier_date<-as.Date(R1$order_delivered_carrier_date,format= "%Y-%m-%d")
R1$shipping_limit_date<-as.Date(R1$shipping_limit_date,format= "%Y-%m-%d")
R1$overtime=(R1$order_delivered_carrier_date-R1$shipping_limit_date)
#我們將R1資料框當中包含"評分score"、"超時overtime"的欄位選出來
#命名為R2
#再用ggplot作圖
#可以看出
R2<- select(R1,contains("score"),contains("overtime"))
ggplot(R2,aes(x = review_score, y = overtime)) +
geom_jitter(color="993339")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
###(b)超時寄送的產品評價(只包含超時訂單)
#把產品評論R剔除缺漏值
rm.R=R[complete.cases(R), ]
#我們再將rm.A3、rm.R合併工作表,命成為R1
R1=merge(rm.A3,rm.R,by="order_id")
R1$order_delivered_carrier_date<-as.Date(R1$order_delivered_carrier_date,format= "%Y-%m-%d")
R1$shipping_limit_date<-as.Date(R1$shipping_limit_date,format= "%Y-%m-%d")
R1$overtime=(R1$order_delivered_carrier_date-R1$shipping_limit_date)
#我們將R1資料框當中包含"評分score"、"超時overtime"的欄位選出來
#命名為R2
#再用ggplot作圖
#可以看出
R2<- select(R1,contains("score"),contains("overtime"))
ggplot(R2,aes(x = review_score, y = overtime)) +
geom_jitter(color="993339")+
ylim(c(0,30))+
geom_hline(aes(yintercept=10),colour="black",linetype="dashed")
## Warning: Removed 10263 rows containing missing values (geom_point).
#可以看出越容易超時的產品 通常評分越低
lm(review_score~ overtime,data=R2)%>%summary()#且我們可以用迴歸分析來看
##
## Call:
## lm(formula = review_score ~ overtime, data = R2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7838 -0.9057 1.0943 1.2348 1.7968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.659803 0.020093 182.142 < 2e-16 ***
## overtime -0.035124 0.004327 -8.118 5.25e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.574 on 11326 degrees of freedom
## Multiple R-squared: 0.005784, Adjusted R-squared: 0.005697
## F-statistic: 65.9 on 1 and 11326 DF, p-value: 5.246e-16
#P-value<0.05 有顯著性
#(C)產品超時寄出是否與產品價格有關聯 (包含準時、超時訂單)
ggplot(rm.A3,aes(x=overtime,y=price ))+
geom_point(alpha = 1/5,position = 'jitter',stat="identity",color="993338")+
xlim(c(-500,100))+
ylim(c(0,7000)) #未排除沒有超時overtime的資料(包含overtime<0 未超時的產品)
## Warning: Removed 2 rows containing missing values (geom_point).
lm(price~ overtime,data=rm.A3) %>% summary()
##
## Call:
## lm(formula = price ~ overtime, data = rm.A3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.4 -80.2 -45.5 13.9 6613.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121.22009 0.63980 189.466 < 2e-16 ***
## overtime 0.33792 0.09426 3.585 0.000337 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 182 on 108619 degrees of freedom
## Multiple R-squared: 0.0001183, Adjusted R-squared: 0.0001091
## F-statistic: 12.85 on 1 and 108619 DF, p-value: 0.0003374
#且P-value=0.000337<0.05 產品超時寄出、產品價格有顯著性
###(C)產品超時寄出是否與產品價格有關聯 (只含超時訂單)
ggplot(rm.A3,aes(overtime,price))+
geom_point(alpha = 1/5,position = 'jitter',stat="identity",color="993338")+
xlim(c(0,100))+
ylim(c(0,7000)) #已排除無超時overtime數據(x軸下限=0),
## Warning: Removed 98502 rows containing missing values (geom_point).
#目測出價格越便宜產品,越容易超時
#我們再進一步往下看
lm(price~ overtime,data=rm.A3)%>%summary()
##
## Call:
## lm(formula = price ~ overtime, data = rm.A3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.4 -80.2 -45.5 13.9 6613.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121.22009 0.63980 189.466 < 2e-16 ***
## overtime 0.33792 0.09426 3.585 0.000337 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 182 on 108619 degrees of freedom
## Multiple R-squared: 0.0001183, Adjusted R-squared: 0.0001091
## F-statistic: 12.85 on 1 and 108619 DF, p-value: 0.0003374
#且P-value=0.000337<0.05
#產品超時寄出、產品價格有顯著性
###(C)產品超時寄出是否與產品價格有關聯(只含超時訂單且在價格500畫一輔助線)
ggplot(rm.A3,aes(overtime,price))+
geom_point(alpha = 1/5,position = 'jitter',stat="identity",color="993338")+
xlim(c(0,100))+
ylim(c(0,7000)) +
geom_hline(aes(yintercept=500),colour="black",linetype="dashed")
## Warning: Removed 98535 rows containing missing values (geom_point).
#已排除overtime(x軸下限=0),且把價格訂500當輔助線
#目測出價格越便宜的商品,越容易超時
lm(price~ overtime,data=rm.A3)%>%summary()
##
## Call:
## lm(formula = price ~ overtime, data = rm.A3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.4 -80.2 -45.5 13.9 6613.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121.22009 0.63980 189.466 < 2e-16 ***
## overtime 0.33792 0.09426 3.585 0.000337 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 182 on 108619 degrees of freedom
## Multiple R-squared: 0.0001183, Adjusted R-squared: 0.0001091
## F-statistic: 12.85 on 1 and 108619 DF, p-value: 0.0003374
#且P-value=0.000337<0.05 產品超時寄出、產品價格有顯著性
load("data/olist.rdata")
library(plotly)
install.packages("devtools", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/zj/3mrqmd7j1516zg6pbw_ps6k00000gn/T//RtmpfpQ0nh/downloaded_packages
install.packages("Rmisc", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/zj/3mrqmd7j1516zg6pbw_ps6k00000gn/T//RtmpfpQ0nh/downloaded_packages
library(Rmisc)
## Loading required package: lattice
library(dplyr)
library(ggplot2)
library(magrittr)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.2
library(devtools)
## Warning: package 'devtools' was built under R version 3.5.2
install_github("cttobin/ggthemr",force=TRUE)
## Downloading GitHub repo cttobin/ggthemr@master
##
##
checking for file ‘/private/var/folders/zj/3mrqmd7j1516zg6pbw_ps6k00000gn/T/RtmpfpQ0nh/remotesc3b7b3a813c/cttobin-ggthemr-10eb50a/DESCRIPTION’ ...
✔ checking for file ‘/private/var/folders/zj/3mrqmd7j1516zg6pbw_ps6k00000gn/T/RtmpfpQ0nh/remotesc3b7b3a813c/cttobin-ggthemr-10eb50a/DESCRIPTION’ (357ms)
##
─ preparing ‘ggthemr’:
##
checking DESCRIPTION meta-information ...
✔ checking DESCRIPTION meta-information
##
─ checking for LF line-endings in source and make files and shell scripts
##
─ checking for empty or unneeded directories
## Removed empty directory ‘ggthemr/README_files’
##
─ building ‘ggthemr_1.1.0.tar.gz’
##
##
library(ggthemr)
ggthemr("sea")
###2.評論系統改善(加強顧客填表單的比率與速度,給予回饋或是其他行銷) ###a.填表速度與寄送時間的關係,填表的時間與評論之間的關係。(會不會因為填寫時間長是因為體驗時間導致,說不定填寫越久,體驗越久,評分越高)可以探討進行行銷活動促使評論時間縮短,或是給予顧客賞味期再寄出滿意表。
###每個產品的評論
P1<-left_join(P,TPC,by="product_category_name")
I1<-I[,c(1,3)]
P2<-merge(P1,I1,by="product_id",all.x=T)
PandR<-merge(R,P2,by="order_id",all.x=T,all.y=T)
par(cex=1,mar=c(3,3,5,2),cex.axis=5)
ggplot(PandR,aes(x=PandR$product_category_name_english,fill = PandR$review_score),width=1) +
geom_bar(aes(fill=factor(review_score)),position= "fill")+ coord_flip()
###每個產品的運送時間
#日期轉化,計算寄出到回答完畢的時間
PandR$review_creation_date<-as.Date(PandR$review_creation_date,format="%Y-%m-%d")
PandR$review_answer_timestamp<-as.Date(PandR$review_answer_timestamp,format="%Y-%m-%d")
PandR$diff<-PandR$review_answer_timestamp-PandR$review_creation_date
table(PandR$diff)%>%table
## .
## 1 2 3 4 5 6 8 9 10 11 12 13
## 79 26 19 11 13 7 3 3 1 4 2 1
## 14 15 18 20 23 25 27 31 39 45 47 51
## 1 6 2 1 3 1 1 1 1 1 3 1
## 65 73 94 99 101 114 158 159 170 261 293 334
## 1 1 1 1 1 1 1 1 1 1 1 1
## 423 522 735 1057 2026 3176 5284 16185 18348 27886 35468
## 1 1 1 1 1 1 1 1 1 1 1
PandR$diff<-as.numeric(PandR$diff)
PandR$diff1<-cut(PandR$diff,breaks=c(-1,0,3,30,Inf),labels=c("0","1~3","3~30","above 30"))
par(cex=1,mar=c(3,3,5,2),cex.axis=5)
ggplot(PandR,aes(x=PandR$product_category_name_english,fill = factor(PandR$diff1)),width=1) +
geom_bar(position= "fill")+ coord_flip()
#日期轉化,計算寄出到回答完畢的時間
R$review_creation_date<-as.Date(R$review_creation_date,format="%Y-%m-%d")
R$review_answer_timestamp<-as.Date(R$review_answer_timestamp,format="%Y-%m-%d")
R$diff<-R$review_answer_timestamp-R$review_creation_date
table(R$diff)%>%table
## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 83 28 17 15 11 4 1 2 3 3 1 3
## 14 15 16 17 20 21 22 23 24 30 39 42
## 2 4 1 1 1 1 1 1 2 1 1 1
## 43 45 46 52 65 80 81 89 103 134 155 233
## 1 1 1 1 1 1 1 1 1 2 1 1
## 258 291 367 423 622 916 1719 2721 4636 14145 16010 24620
## 1 1 1 1 1 1 1 1 1 1 1 1
## 31190
## 1
R$diff<-as.numeric(R$diff)
R$diff1<-cut(R$diff,breaks=c(-1,0,3,30,Inf),labels=c("0","1~3","3~30","above 30"))
P3<-P1[,c(1,10)]
n_distinct(I$product_id)
## [1] 32951
I1<-left_join(I,P,by="product_id")
RandI<-merge(R,I1,by="order_id",all.x=T,all.y=T)
duplicated(RandI$order_id)%>%table
## .
## FALSE TRUE
## 99441 14659
RandI<-left_join(RandI,TPC,by="product_category_name")
RandI$review_score<-as.numeric(RandI$review_score)
#每個評分高低含有多少比例的體驗天數
f<-ggplot(RandI,aes(review_score))+geom_bar(aes(fill=factor(diff1)),position="fill")+
labs(title="The relationship between the review score and experience day",x="Review",y="probability",fill="Days") +
scale_fill_brewer(palette="Black")+theme_bw()
## Warning in pal_name(palette, type): Unknown palette Black
ggplotly(f)
#每個體驗天數間距的評分高低分布
f1<-ggplot(RandI,aes(diff1))+geom_bar(aes(fill=factor(review_score)),position="fill")+
labs(title="The relationship between the review score and experience day",x="Days",y="probability",fill="Days")+theme_bw()
ggplotly(f1)
lm(RandI$review_score~ RandI$diff) %>%summary
##
## Call:
## lm(formula = RandI$review_score ~ RandI$diff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6174 -0.9967 0.9985 1.0021 1.0033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9967274 0.0043279 923.48 < 2e-16 ***
## RandI$diff 0.0011982 0.0004341 2.76 0.00577 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.412 on 114098 degrees of freedom
## Multiple R-squared: 6.678e-05, Adjusted R-squared: 5.802e-05
## F-statistic: 7.62 on 1 and 114098 DF, p-value: 0.005774
#由回歸來看,天數高低的效果對於平輪的高低是沒有顯著影響的。
###b.有文字評論的比率,更進階推論哪些會比較容易有文字評論,而那些較少。 產品種類與評論關係 哪些產品常常給予評論
is.na(RandI$review_comment_message)%>%table
## .
## FALSE TRUE
## 49127 64973
#將NA變成0,將不是NA的變成1
RandI$review_comment_message[!is.na(RandI$review_comment_message)]<-TRUE
RandI$review_comment_title[!is.na(RandI$review_comment_title)]<-TRUE
RandI$review_comment_message[is.na(RandI$review_comment_message)] <- FALSE
RandI$review_comment_title[is.na(RandI$review_comment_title)] <- FALSE
#總填寫率與總填寫率
table(RandI$review_comment_title)
##
## FALSE TRUE
## 100388 13712
table(RandI$review_comment_message)
##
## FALSE TRUE
## 64973 49127
table(RandI$review_comment_title==F&RandI$review_comment_message==F)
##
## FALSE TRUE
## 51035 63065
#前10大銷量產品填寫率
Top10<-table(RandI$product_category_name_english)%>%sort%>%tail(10)
RandI1<-filter(RandI,RandI$product_category_name_english%in%names(Top10))
f2<-ggplot(RandI1,aes(x=product_category_name_english,fill=review_comment_message))+geom_bar()+ theme(axis.text.x=element_text(angle=45, hjust=1))
f3<-ggplot(RandI1,aes(x=product_category_name_english,fill=review_comment_message))+geom_bar(position="fill")+ theme(axis.text.x=element_text(angle=45, hjust=1))
multiplot(f2,f3)
table(RandI1$product_category_name_english,RandI1$review_comment_message)
##
## FALSE TRUE
## auto 2347 1909
## bed_bath_table 5895 5377
## computers_accessories 4559 3336
## furniture_decor 4755 3661
## garden_tools 2300 2061
## health_beauty 5894 3834
## housewares 4019 2970
## sports_leisure 5204 3497
## telephony 2466 2084
## watches_gifts 3258 2743
duplicated(R$review_id)%>%table
## .
## FALSE TRUE
## 99173 827
duplicated(R$order_id)%>%table
## .
## FALSE TRUE
## 99441 559
table(R$review_id) %>% table
## .
## 1 2 3
## 98371 777 25
table(R$order_id) %>% table
## .
## 1 2 3
## 98886 551 4
table(R$review_id)%>%sort%>%tail(26)
##
## ffb8cff872a625632ac983eb1f88843c 08528f70f579f0c830189efc523d2182
## 2 3
## 0c76e7a547a531e7bf9f0b99cba071c1 1fb4ddc969e6bea80e38deec00393a6f
## 3 3
## 2172867fd5b1a55f98fe4608e1547b4b 2d6ac45f859465b5c185274a1c929637
## 3 3
## 308316408775d1600dad81bd3184556d 32415bbf6e341d5d517080a796f79b5c
## 3 3
## 3415c9f764e478409e8e0660ae816dd2 38821b5c496b678cf91acc34892805ad
## 3 3
## 39b4603793c1c7f5f36d809b4a218664 4219a80ab469e3fc9901437b73da3f75
## 3 3
## 44e9f871226d8a130de3fc39dfbdf0c5 4548534449b1f572e357211b90724f1b
## 3 3
## 4d0e6dd087008d1f992d25ef6e1f619f 69a1068c3128a14994e3e422e4539e04
## 3 3
## 70509c441d994fa03d6c1457930c9024 7b606b0d57b078384f0b58eac1d41d78
## 3 3
## 832acec9bbf4efe65c3fb6423d8b4ed7 9e25d6e3025e9b9a0fc7f03588d33e2b
## 3 3
## abbfacb2964f74f6487c9c10ac46daa6 c444278834184f72b1484dfe47de7f97
## 3 3
## dbdf1ea31790c8ecfcc6750525661a9b ddc52555ca27b0fe67d5255147682d2d
## 3 3
## e44840754f12fad2b8646712121b349a f4bb9d6dd4fb6dcc2298f0e7b17b8e1e
## 3 3
資料期間四個季度的銷售總額擷取/合併資料
orderPrice = select(I,"order_id","price")
orderTime = select(O,"order_id","order_purchase_timestamp")
order = merge(orderPrice, orderTime, by="order_id")
日期轉換、增加季度欄位
order$order_purchase_timestamp = as.Date(order$order_purchase_timestamp, format ="%Y-%m-%d")
order$month = format(order$order_purchase_timestamp, format = "%m")
#order$quarter =
#order = mutate(order,month)
order$quarter[order$month %in% c("01","02","03")] = "Q1"
order$quarter[order$month %in% c("04","05","06")] = "Q2"
order$quarter[order$month %in% c("07","08","09")] = "Q3"
order$quarter[order$month %in% c("10","11","12")] = "Q4"
計算四個季度的銷售總額
#salesOfQuarter = group_by(order,quarter) %>%
# summarise(amount = n(),
# sales = sum(price))
#salesOfQuarter
#ggplot(salesOfQuarter,aes(x=quarter, y=sales, group = 1))+
# geom_line(col= "red")
前十大類型商品的季度銷售額擷取/合併資料
productID = select(I,"order_id","product_id")
productCategory = select(P,"product_id","product_category_name")
product = merge(productID, productCategory ,by="product_id")
A = merge(order, product ,by="order_id")
選出總銷售額前十大種類商品
tail(sort(tapply(A$price, A$product_category_name, sum)),10)
## cool_stuff ferramentas_jardim automotivo
## 684197.9 687874.9 734323.6
## utilidades_domesticas moveis_decoracao esporte_lazer
## 872871.8 1138493.8 1179385.7
## relogios_presentes informatica_acessorios cama_mesa_banho
## 1310710.7 1365456.7 1420357.7
## beleza_saude
## 1434669.3
選出總銷售額前十大種類商品
tail(sort(tapply(A$price, A$product_category_name, sum)),10)
## cool_stuff ferramentas_jardim automotivo
## 684197.9 687874.9 734323.6
## utilidades_domesticas moveis_decoracao esporte_lazer
## 872871.8 1138493.8 1179385.7
## relogios_presentes informatica_acessorios cama_mesa_banho
## 1310710.7 1365456.7 1420357.7
## beleza_saude
## 1434669.3
###計算「前十大種類商品在每個季度總銷售額」並製作新的資料框 top1:beleza_saude
beleza_saude = subset(A,product_category_name %in% "beleza_saude")
tapply(beleza_saude$price,beleza_saude$quarter,sum)
## Q1 Q2 Q3 Q4
## 346760.2 461875.6 419852.9 206180.7
top2:cama_mesa_banho
cama_mesa_banho = subset(A,product_category_name %in% "cama_mesa_banho")
tapply(cama_mesa_banho$price,cama_mesa_banho$quarter,sum)
## Q1 Q2 Q3 Q4
## 353637.6 409276.3 388643.3 268800.5
top3:informatica_acessorios
informatica_acessorios = subset(A,product_category_name %in% "informatica_acessorios")
tapply(informatica_acessorios$price,informatica_acessorios$quarter,sum)
## Q1 Q2 Q3 Q4
## 481820.0 371371.5 269515.9 242749.3
top4:relogios_presentes
relogios_presentes = subset(A,product_category_name %in% "relogios_presentes")
tapply(relogios_presentes$price,relogios_presentes$quarter,sum)
## Q1 Q2 Q3 Q4
## 312307.0 435709.8 307641.4 255052.6
top5:esporte_lazer
esporte_lazer = subset(A,product_category_name %in% "esporte_lazer")
tapply(esporte_lazer$price,esporte_lazer$quarter,sum)
## Q1 Q2 Q3 Q4
## 364685.8 319869.7 286887.9 207942.3
top6:moveis_decoracao
moveis_decoracao = subset(A,product_category_name %in% "moveis_decoracao")
tapply(moveis_decoracao$price,moveis_decoracao$quarter,sum)
## Q1 Q2 Q3 Q4
## 315014.3 321748.9 271204.5 230526.1
top7:utilidades_domesticas
utilidades_domesticas = subset(A,product_category_name %in% "utilidades_domesticas")
tapply(utilidades_domesticas$price,utilidades_domesticas$quarter,sum)
## Q1 Q2 Q3 Q4
## 194327.4 320048.9 251615.3 106880.2
top8:automotivo
automotivo = subset(A,product_category_name %in% "automotivo")
tapply(automotivo$price,automotivo$quarter,sum)
## Q1 Q2 Q3 Q4
## 180421.5 232409.2 208157.6 113335.3
top9:ferramentas_jardim
ferramentas_jardim = subset(A,product_category_name %in% "ferramentas_jardim")
tapply(ferramentas_jardim$price,ferramentas_jardim$quarter,sum)
## Q1 Q2 Q3 Q4
## 180176.3 194672.6 144528.2 168497.8
top10:cool_stuff
cool_stuff = subset(A,product_category_name %in% "cool_stuff")
tapply(cool_stuff$price,cool_stuff$quarter,sum)
## Q1 Q2 Q3 Q4
## 161978.5 188233.7 180214.3 153771.4
製作新的資料框
top10 = (names(tail(sort(tapply(A$price, A$product_category_name, sum)),10)))
TOP10 = subset(A,product_category_name %in% top10)
Category <- c("cool_stuff","cool_stuff","cool_stuff","cool_stuff",
"garden_tools","garden_tools","garden_tools","garden_tools",
"auto","auto","auto","auto",
"housewares","housewares","housewares","housewares",
"furniture_decor","furniture_decor","furniture_decor","furniture_decor",
"sports_leisure", "sports_leisure", "sports_leisure", "sports_leisure",
"watches_gifts","watches_gifts","watches_gifts","watches_gifts",
"computers_accessories","computers_accessories","computers_accessories","computers_accessories",
"bed_bath_table","bed_bath_table","bed_bath_table","bed_bath_table",
"health_beauty","health_beauty","health_beauty","health_beauty")
Quarter <- c("Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4",
"Q1","Q2","Q3","Q4")
Price <- c("161979","188234","180214","153771",
"180176","194673","144528","168498",
"180421","232409","208158","113335",
"194327","320049","251615","106880",
"315014","321749","271205","230526",
"364686","319870","286888","207942",
"312307","435710","307641","255053",
"481820","371371","269516","242749",
"353638","409276","388643","268800",
"346760","461876","419853","206181")
price <- as.numeric(Price)
B = data.frame(Category,Quarter,price)
B
## Category Quarter price
## 1 cool_stuff Q1 161979
## 2 cool_stuff Q2 188234
## 3 cool_stuff Q3 180214
## 4 cool_stuff Q4 153771
## 5 garden_tools Q1 180176
## 6 garden_tools Q2 194673
## 7 garden_tools Q3 144528
## 8 garden_tools Q4 168498
## 9 auto Q1 180421
## 10 auto Q2 232409
## 11 auto Q3 208158
## 12 auto Q4 113335
## 13 housewares Q1 194327
## 14 housewares Q2 320049
## 15 housewares Q3 251615
## 16 housewares Q4 106880
## 17 furniture_decor Q1 315014
## 18 furniture_decor Q2 321749
## 19 furniture_decor Q3 271205
## 20 furniture_decor Q4 230526
## 21 sports_leisure Q1 364686
## 22 sports_leisure Q2 319870
## 23 sports_leisure Q3 286888
## 24 sports_leisure Q4 207942
## 25 watches_gifts Q1 312307
## 26 watches_gifts Q2 435710
## 27 watches_gifts Q3 307641
## 28 watches_gifts Q4 255053
## 29 computers_accessories Q1 481820
## 30 computers_accessories Q2 371371
## 31 computers_accessories Q3 269516
## 32 computers_accessories Q4 242749
## 33 bed_bath_table Q1 353638
## 34 bed_bath_table Q2 409276
## 35 bed_bath_table Q3 388643
## 36 bed_bath_table Q4 268800
## 37 health_beauty Q1 346760
## 38 health_beauty Q2 461876
## 39 health_beauty Q3 419853
## 40 health_beauty Q4 206181
前十大類型商品佔四季銷售總額
c1=ggplot(B,aes(x=Quarter,y=price, fill=Category))+
geom_bar(stat="identity")
ggplotly(c1)
前十大類型商品佔四季銷售比例
c2=ggplot(B,aes(x=Quarter,y=price, fill=Category))+
geom_bar(stat="identity",position="fill")
ggplotly(c2)
前十大類型商品的季度銷售額
B$Category = factor(B$Category, levels=c("health_beauty","bed_bath_table","computers_accessories","watches_gifts","sports_leisure","furniture_decor","housewares","auto","garden_tools","cool_stuff"))
b1=ggplot(B,aes(x=Category,y=price,fill=Quarter, group = factor(1)))+
geom_bar(stat="identity")+coord_flip()
ggplotly(b1)
前十大類型商品的季度銷售比例
b2=ggplot(B,aes(x=Category,y=price,fill=Quarter, group = factor(1)))+
geom_bar(stat="identity",position="fill")+coord_flip()
ggplotly(b2)
💡 結論
#‧平台銷售額在第二季度最高;第一、三季度差不多;第四季度最低
#‧computers_accessories、sports_leisure、furniture_decor 三種類在第一季度賣得最好;
# garden_tools、watches_gifts、housewares 三種類在第二季度賣得最好;
#health_beauty、bed_bath_table、auto、cool_stuff 四種類在第三季度賣得最好